substrRight <- function(x, n){
  substr(x, nchar(x)-n+1, nchar(x))
}

# get 1,1,0,0
get_sequence <- function(x, lead = 0, lag = 4) {
  treated.id <- x[x$V3 == 1 & x$V1 == (max(x$V1)-lead), ]$V2
  treated_sequence <- x$V3[which(x$V2 == treated.id)][1:lag]
  return(treated_sequence)
}


# newlist <- split(land_s, land_s$v1)
# 
# x <- newlist[[15]]
# 
# newnewlist <- split(x, x$year)
# xx <- newnewlist[[11]]

mat_compute <- function(place = place,
                        longitude = "long",
                        latitude = "lat"
) {
  # computing a distance matrix
  mat <- geosphere::distm(place[,c(longitude,latitude)],
               place[,c(longitude,latitude)],
               fun = distVincentyEllipsoid)
  return(mat)
}

price_distance <- function(place = place, 
                           mat = mat,
                           # longitude = "long", 
                           # latitude = "lat", 
                           price = c("final_price", "begi_price",
                                     "final_price_pa", "begi_price_pa",
                                     "price_difference", "price_difference_pa"),
                           id = "iid",
                           radius = 500
                           ) {
  
  mat[mat <= radius] <- 1
  mat[mat > radius] <- 0
  diag(mat) <- 0
  # assign row and col names with land ids
  row.names(mat) <- colnames(mat) <- place[,id]
  for (i in 1:length(price)){
    # get land ids whose price is not NA
    remaining_ids <- place[,id][which(is.na(place[,price[i]]) == FALSE)]
    # create the variable of interest with all NAs
    place[, paste0("avg_within_", radius, "_", price[i])] <- NA
    
    sub_mat <- mat[rownames(mat) %in% remaining_ids, 
                   colnames(mat) %in% remaining_ids]
    sub_price <- place[,price[i]][which(is.na(place[,price[i]]) == FALSE)]
    
    if (length(sub_mat) > 1) {
      overall <- colSums(sub_mat)
    } else if (length(sub_mat) == 1) {
      overall <- sub_mat
    } else {
      overall <- sub_mat <- 0
      
    }
    
    avg_within <- as.numeric(sub_mat %*% sub_price/overall)
    
    
    # sub_mat <- matrix(data = rep(0,16), nrow = 4, ncol = 4)
    # sub_mat[2,1] <- sub_mat[1,2] <- sub_mat[1,3] <- sub_mat[3,1] <- 1
    # sub_price <- c(3, 6, 7, 10)
    # 
    # avg_within <- as.numeric(sub_mat %*% sub_price/colSums(sub_mat))
    # 
    place[, paste0("avg_within_", radius, "_", price[i])][which(place[,id]%in%remaining_ids)] <- 
      avg_within
    
    place[,paste0(price[i], "_distance", "_", radius)] <- 
      abs(place[,price[i]] - place[, paste0("avg_within_", radius, "_", price[i])])
    place[,paste0(price[i], "_distance", "_", radius)][which(place[,paste0(price[i], "_distance", "_", radius)] == "NaN")] <- NA  
    cat(price[i], "done \n")
  }
  
  return(place)
}

price_distance_wrapper <- function(place, 
                                   longitude = "long",
                                   latitude = "lat",
                                   price = c("final_price", "begi_price",
                                             "final_price_pa", "begi_price_pa"),
                                   id = "iid",
                                   radius = c(1500, 500)
) {
  mat <- mat_compute(place = place, longitude = longitude,
                     latitude = latitude)
  cat("matrix computation done \n")
  for (i in 1:length(radius)) {
    place <- price_distance(place = place, mat = mat, 
                            price = price, id = id, 
                            radius = radius[i])  
    cat(radius[i], " done \n")
  }
  cat("all done \n")
  return(place)
}

GetJDnew <- function(address, apikey = '1zLeVq0CN38OhXR9xyWjQljU4GTtOe7k',
                     # payload, 
                     url = "http://api.map.baidu.com/geocoder/v2/",
                     header = c("User-Agent"="Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36")) {

  
  payload = list(
    'output' = 'json', 
    'ak' = apikey
  )
  payload[["address"]] <- address
  
  tryCatch({
    web <- GET(url,add_headers(.headers = header),query = payload)
    content <- web %>% content(as="text",encoding="UTF-8") %>% fromJSON(flatten = TRUE) %>% `[[`(2) %>% `[[`(1)
    addinfo <- content
    cat(address, "completes ",  "\n")
  },error = function(e){
    cat(address, "fails ",  "\n")
    addinfo <- list("lng" = NA ,"lat" = NA)
  })
  Sys.sleep(.002)
  return(addinfo)
}

GetJD <- function(address, 
                  apikey = '1zLeVq0CN38OhXR9xyWjQljU4GTtOe7k'){
  url = "http://api.map.baidu.com/geocoder/v2/"
  header  <- c("User-Agent"="Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36")
  payload = list(
    'output' = 'json', 
    'ak' = apikey
  )
  addinfo <- data.frame()
  for (i in 1:length(address)){
    payload[["address"]]=address[i]
    # print(sprintf("正在抓取第【%s】个地址",i))
    tryCatch({
      web <- GET(url,add_headers(.headers = header),query = payload)
      content <- web %>% content(as="text",encoding="UTF-8") %>% fromJSON(flatten = TRUE) %>% `[[`(2) %>% `[[`(1)
      addinfo <- rbind(addinfo,content)
      cat("Task", i, "completes ", address[i], "\n")
    },error = function(e){
      cat("Task", i, "fails ", address[i], "\n")
      addinfo <- rbind(addinfo,list("lng" = NA ,"lat" = NA))
    })
    Sys.sleep(.01)
    
  }
  print("Mission Accomplished!")
  return(addinfo) 
}

GetAddressNew <- function(xy, apikey = '1zLeVq0CN38OhXR9xyWjQljU4GTtOe7k',
                          # payload, 
                          url = "http://api.map.baidu.com/geocoder/v2/",
                          header = c("User-Agent"="Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36")) {
  payload = list(
    'output' = 'json', 
    'ak' = apikey
  )
  payload[["location"]] <- sprintf('%.3f,%.3f',xy[1],xy[2])
  tryCatch({
    web <- GET(url,add_headers(.headers = header),query = payload)
    content <- web %>% content(as="text",encoding="UTF-8") %>% fromJSON(flatten = TRUE) %>% `[[`(2) %>% `[[`(1)
    addinfo <- content
    cat(xy, "completes ",  "\n")
  },error = function(e){
    cat(xy, "fails ",  "\n")
    addinfo <- list("lng" = NA ,"lat" = NA)
  })
  Sys.sleep(.002)
  return(addinfo)
  
}

GetAddress <- function(lddata, 
                       apikey = '1zLeVq0CN38OhXR9xyWjQljU4GTtOe7k'){
  url = "http://api.map.baidu.com/geocoder/v2/"
  header = c('User-Agent'= 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36')
  payload = list(
    'output' = 'json', 
    'ak' = apikey
  ) 
  addinfo = c() 
  for (i in 1:nrow(lddata)){
    payload[['location']] = sprintf('%.3f,%.3f',lddata[i,'lon'],lddata[i,'lat'])
    tryCatch({
      web <-  GET(url,add_headers(.headers = header),query = payload)
      addinfo <- web %>% content(as="text",encoding="UTF-8") %>% fromJSON(flatten = TRUE) %>%`[[`(2) %>%`[[`(2) %>% c(addinfo,.)
    },error = function(e){
      cat(sprintf("第【%d】个任务处理失败!",i),sep = "\n")
      addinfo <- c(addinfo,NA)
    })
    
  } 
  return(addinfo)
}


missing_indicator <- function(x, data) {
  data[, paste0(x, "_missing")] <- ifelse(is.na(data[, x]), 1, 0)
  data[, x] <- ifelse(is.na(data[, x]), 0, data[,x])
  return(data)
}

standardize <- function(x) {
  x <- (x - mean(x, na.rm = T))/sd(x, na.rm = T)
  return(x)
}

taking_log <- function(x, data, conven = 1){
  data[, paste0("log_",x)] <- log(data[, x] + conven)
  data[, paste0("log_",x)][which(data[, paste0("log_",x)] == "-Inf")] <- NA
  data[, paste0("log_",x)][which(data[, paste0("log_",x)] == "Inf")] <- NA
  data[, paste0("log_",x)][which(data[, paste0("log_",x)] == "NaN")] <- NA
  return(data)
}

residualize <- function(data = new_auction, var_name = var_name,
                        formula_character = 
                          "~ as.factor(city_id) + as.factor(year) + as.factor(forway) + log_area" ) {
  
  fit <- lm(as.formula(paste(var_name, formula_character)), data = data,
            na.action = na.exclude)
  
  data[,paste0("resid_",var_name)] <- residuals(fit, na.action = na.exclude)
  return(data)
}

residualize_lasso <- function(data = new_acution, var_name = var_name, 
                              cores = 6,
                              formula_character = 
                                "~ as.factor(city_id) + as.factor(year) + as.factor(forway) + log_area") {
  dmatrix <- model.matrix(as.formula(paste(var_name, formula_character)), data)[,-1]
  y <- as.numeric(na.omit(data[, var_name]))
  registerDoParallel(cores)
  cvfit <- cv.glmnet(y = y, x = dmatrix, type.measure = "mse", nfolds = 10, 
                               parallel = TRUE)
  predicted <- predict(cvfit, newx = dmatrix, s = "lambda.min")
  # colnames(predicted) <- var_name
  colnames(data)[colnames(data) == "predicted[match(rownames(data), rownames(predicted))]"]
  data <- cbind(data, predicted[match(rownames(data), rownames(predicted))])
  colnames(data)[colnames(data) == "predicted[match(rownames(data), rownames(predicted))]"] <- paste0("resid_",var_name)
  data[, paste0("resid_",var_name)] <- data[, var_name] - data[, paste0("resid_",var_name)]
  return(data)
}

residualize_lasso_sub <- function(data = place,
                                  price = c("log_final_price", "log_begi_price",
                                            "log_final_price_pa", "log_begi_price_pa",
                                            "log_price_difference", "log_price_difference_pa"),
                                  # price = "final_price_pa",
                                  formula_character = 
                                    "~  (as.factor(year_month) + as.factor(county_id) +
                                  as.factor(forway) + 
                                  commerce + industry + resi + other + 
                                  log_area +") {
  # first, get matrix
  mat <- mat_compute(place = data)
  # add 100 meters to all zeros 
  mat[mat == 0] <- mat[mat == 0] + 100
  # sum(is.infinite(mat))
  
  # inverse weights
  mat <- 1/mat
  diag(mat) <- 0
  mse_lassos <- rep(NA, length(price))
  for (i in 1:length(price)) {
    y <- data[,price[i]] 
    y[is.na(y)] <- 0
    data[, paste0(price[i], "_w")] <- mat %*% y
    
    # begin LASSO
    dmatrix <- tryCatch(model.matrix(as.formula(paste0(price[i], formula_character, paste0(price[i], "_w", ")^9"))), data)[,-1],
                        error = function(err) NA)
    y <- tryCatch(as.numeric(na.omit(data[,price[i]])), error = function(err) NA)
    set.seed(2019)
    
    cat("running lasso", "\n")
    cvfit_tax_ratio <- tryCatch(cv.glmnet(y = y, x = dmatrix, type.measure = "mse", nfolds = 5),
                                error = function(err) NA)
    predicted <- tryCatch(predict(cvfit_tax_ratio, newx = dmatrix, s = "lambda.min"),
                          error = function(err) NA)
    
    mse_lassos[i] <- tryCatch(mean((y - predicted)^2),
                              error = function(err) NA)
    
    data <- cbind(data, predicted[match(rownames(data), rownames(predicted))])
    
    data[, paste0("resid_",price[i])] <- data[, price[i]] - data[, "predicted[match(rownames(data), rownames(predicted))]"]   
    cat(paste0("resid_",price[i]), "done!", "\n")
  }
  names(mse_lassos) <- price
  return(list("data" =  data, "mse" = mse_lassos))
} 




residualize_ols_sub <- function(data = place,
                                  var_name = "final_price_pa",
                                  formula_character = 
                                    "~  as.factor(month) + as.factor(county_id) +
(as.factor(forway) + 
                                  commerce + industry + resi + other + 
                                  price_difference_w)^3 +
                                  log_area") {
  # first, get matrix
  mat <- mat_compute(place = data)
  # add 100 meters to all zeros 
  mat[mat == 0] <- mat[mat == 0] + 100
  # sum(is.infinite(mat))
  
  # inverse weights
  mat <- 1/mat
  diag(mat) <- 0
  
  y <- data[,var_name] 
  y[is.na(y)] <- 0
  data[, paste0(var_name, "_w")] <- mat %*% y
  
  fit <- lm(as.formula(paste(var_name, formula_character)), data = data,
            na.action = na.exclude)
  mse_ols <- mean((fit$model[, var_name] - fit$fitted.values )^2)
  data[,paste0("resid_",var_name)] <- residuals(fit, na.action = na.exclude)
  
  return(list("data" = data, 
              "mse" = mse_ols))
  
} 


# 
# 
# # tmp_price <- xx$price
# # nonas <- sum(is.na(tmp_price) == FALSE)
# # tmp_price[is.na(tmp_price)] <- 0
# # mat %*% tmp_price
# # 
# # distances <- distHaversine(p1 = c(land_s$x[885],
# #                                   land_s$y[885]), 
# #                            p2= c(land_s$x[888],
# #                                  land_s$y[888]),
# #                            r=6378137)
# starting_time <- Sys.time()
# mat <- distm(place[,c(longitude,latitude)],
#              place[,c(longitude,latitude)], 
#              fun = distVincentyEllipsoid)  
# mat[mat <= radius] <- 1
# mat[mat > radius] <- 0
# diag(mat) <- 0
# place$avg_within_price <- as.numeric(mat %*% place[,price]/colSums(mat))
# place$price_distance <- place$price - place$avg_within_price
# place$price_distance[which(place$price_distance == "NaN")] <- NA
# ending_time <- Sys.time()
# ending_time - starting_time

unfold <- function (data, time, event, cov, cov.names = paste("covariate",
                                                              ".", 1:ncovs, sep = ""), suffix = ".time", cov.times = 0:ncov,
                    common.times = TRUE, lag = 0, ...) {
  vlag <- function(x, lag) c(rep(NA, lag), x[1:(length(x) -
                                                  lag)])
  xlag <- function(x, lag) apply(as.matrix(x), 2, vlag, lag = lag)
  all.cov <- unlist(cov)
  if (!is.numeric(all.cov))
    all.cov <- which(is.element(names(data), all.cov))
  if (!is.list(cov))
    cov <- list(cov)
  ncovs <- length(cov)
  nrow <- nrow(data)
  ncol <- ncol(data)
  ncov <- length(cov[[1]])
  nobs <- nrow * ncov
  if (length(unique(c(sapply(cov, length), length(cov.times) -
                      1))) > 1)
    stop(paste("all elements of cov must be of the same length and \n",
               "cov.times must have one more entry than each element of cov."))
  var.names <- names(data)
  subjects <- rownames(data)
  omit.cols <- if (!common.times)
    c(all.cov, cov.times)
  else all.cov
  keep.cols <- (1:ncol)[-omit.cols]
  factors <- names(data)[keep.cols][sapply(data[keep.cols],
                                           is.factor)]
  levels <- lapply(data[factors], levels)
  first.covs <- sapply(cov, function(x) x[1])
  factors.covs <- which(sapply(data[first.covs], is.factor))
  levels.covs <- lapply(data[names(factors.covs)], levels)
  nkeep <- length(keep.cols)
  if (is.numeric(event))
    event <- var.names[event]
  events <- sort(unique(data[[event]]))
  if (length(events) > 2 || (!is.numeric(events) && !is.logical(events)))
    stop("event indicator must have values {0, 1}, {1, 2} or {FALSE, TRUE}")
  if (!(all(events == 0:1) || all(events == c(FALSE, TRUE)))) {
    if (all(events = 1:2))
      data[[event]] <- data[[event]] - 1
    else stop("event indicator must have values {0, 1}, {1, 2} or {FALSE, TRUE}")
  }
  times <- if (common.times)
    matrix(cov.times, nrow, ncov + 1, byrow = TRUE)
  else as.matrix(data[, cov.times])
  new.data <- matrix(Inf, nobs, 3 + ncovs + nkeep)
  rownames <- rep("", nobs)
  colnames(new.data) <- c("start", "stop", paste(event, suffix,
                                                 sep = ""), var.names[-omit.cols], cov.names)
  end.row <- 0
  data <- as.matrix(as.data.frame(lapply(data, as.numeric)))
  for (i in 1:nrow) {
    start.row <- end.row + 1
    end.row <- end.row + ncov
    start <- times[i, 1:ncov]
    stop <- times[i, 2:(ncov + 1)]
    event.time <- ifelse(stop == data[i, time] & data[i,
                                                      event] == 1, 1, 0)
    keep <- matrix(data[i, -omit.cols], ncov, nkeep, byrow = TRUE)
    select <- apply(matrix(!is.na(data[i, all.cov]), ncol = ncovs),
                    1, all)
    rows <- start.row:end.row
    cov.mat <- xlag(matrix(data[i, all.cov], nrow = length(rows)),
                    lag)
    new.data[rows[select], ] <- cbind(start, stop, event.time,
                                      keep, cov.mat)[select, ]
    rownames[rows] <- paste(subjects[i], ".", seq(along = rows),
                            sep = "")
  }
  row.names(new.data) <- rownames
  new.data <- as.data.frame(new.data[new.data[, 1] != Inf &
                                       apply(as.matrix(!is.na(new.data[, cov.names])), 1, all), ])
  for (fac in factors) {
    new.data[[fac]] <- factor(levels[[fac]][new.data[[fac]]])
  }
  fcv <- 0
  for (cv in factors.covs) {
    fcv <- fcv + 1
    new.data[[cov.names[cv]]] <- factor(levels.covs[[fcv]][new.data[[cov.names[cv]]]])
  }
  new.data
}
